Code
pheno_clean <- readRDS("data/pheno_clean.rds")
expr_data <- readRDS("data/expr_df_matched.rds")Place a subtitle here if needed
pheno_clean <- readRDS("data/pheno_clean.rds")
expr_data <- readRDS("data/expr_df_matched.rds")library(ggplot2)
library(factoextra)Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Make sure both are character
colnames(expr_data) <- as.character(colnames(expr_data))
rownames(pheno_clean) <- as.character(rownames(pheno_clean))
# Find common sample IDs
common_samples <- intersect(colnames(expr_data), rownames(pheno_clean))
# Subset and align both
expr_matched <- expr_data[, common_samples]
pheno_matched <- pheno_clean[common_samples, ] # works since rownames match
expr_t <- t(expr_matched)
# Run PCA
pca_res <- prcomp(expr_t, scale. = TRUE)
# Prepare PCA plot data
pca_df <- as.data.frame(pca_res$x[, 1:2])
pca_df$batch <- pheno_matched$batch
# Plot PCA
library(ggplot2)
ggplot(pca_df, aes(PC1, PC2, color = batch)) +
geom_point(alpha = 0.7, size = 2) +
stat_ellipse(type = "norm", size = 1.2) + # Draw ellipses for each batch
scale_color_manual(values = c("ILLUMINATE-1" = "purple", "ILLUMINATE-2" = "yellow")) +
theme_minimal() +
labs(title = "PCA Colored by Batch with Ellipses",
subtitle = "No strong separation suggests minimal batch effect")Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This PCA plot shows overlapping distributions of ILLUMINATE-1 and ILLUMINATE-2 samples. The ellipses capture the spread of each batch along the top two principal components. The lack of clear separation between ellipses suggests no dominant batch effect.
Boxplots by Batch (for a few top genes)
library(ggplot2)
library(reshape2)
# Use top 10 most variable genes for easier plotting
vars <- apply(expr_matched, 1, var)
top_genes <- names(sort(vars, decreasing = TRUE))[1:10]
expr_subset <- expr_matched[top_genes, ]
# Transpose and prepare for plotting
expr_df <- as.data.frame(t(expr_subset))
expr_df$batch <- pheno_matched$batch
# Melt for ggplot
expr_long <- melt(expr_df, id.vars = "batch", variable.name = "Gene", value.name = "Expression")
# Plot
ggplot(expr_long, aes(x = Gene, y = Expression, fill = batch)) +
geom_boxplot(outlier.size = 0.5, position = "dodge") +
theme_minimal() +
labs(title = "Boxplots of Expression by Batch", x = "Top Variable Genes") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))No major batch effect — there’s no consistent shift in expression levels between ILLUMINATE-1 and ILLUMINATE-2.
Differences, where they exist, are gene-specific and small.
# Categorize SLEDAI score
pheno_clean$sledai_group <- cut(
pheno_clean$sledai_at_baseline,
breaks = c(-Inf, 4, 10, Inf),
labels = c("Mild", "Moderate", "High")
)
# Match samples
common_samples <- intersect(colnames(expr_data), rownames(pheno_clean))
expr_matched <- expr_data[, common_samples]
pheno_matched <- pheno_clean[common_samples, ]
# Transpose for PCA (samples = rows)
expr_t <- t(expr_matched)
pca_res <- prcomp(expr_t, scale. = TRUE)
pca_df <- as.data.frame(pca_res$x[, 1:2])
pca_df$sledai_group <- pheno_matched$sledai_group
library(ggplot2)
ggplot(pca_df, aes(PC1, PC2, color = sledai_group)) +
geom_point(size = 2, alpha = 0.7) +
stat_ellipse(type = "norm") +
scale_color_manual(values = c("Mild" = "#A6CEE3", "Moderate" = "#FDBF6F", "High" = "#E31A1C")) +
theme_minimal() +
labs(title = "PCA Colored by SLEDAI Severity Group",
subtitle = "Mild (0–4), Moderate (5–10), High (>10)")1. Cluster Overlap
The ellipses and points overlap heavily, which suggests:
There isn’t a strong, distinct separation between the groups based on just PC1 and PC2.
This implies gene expression profiles alone (in these two PCs) don’t cleanly distinguish disease severity.
However, there may still be subtle patterns, or separability in higher PCs or nonlinear space.
2. Gradient Trend
Mild group (blue) tends to occupy a more leftward region (lower PC1)
High group (red) is slightly more concentrated toward the center and right
This suggests a possible trend across severity, though weak — useful motivation for modeling!
3. Biological Implication
Since SLEDAI is a clinical composite score from different systems (CNS, renal, skin, etc.), the gene expression signal may be diluted or spread out.
But this still supports using machine learning, which can capture complex patterns beyond PC1/PC2.
loadings <- pca_res$rotation # genes x PCs
# Get top contributing genes for PC1
top_PC1 <- sort(abs(loadings[, 1]), decreasing = TRUE)[1:20]
top_PC1_genes <- names(top_PC1)
# Get top contributing genes for PC2
top_PC2 <- sort(abs(loadings[, 2]), decreasing = TRUE)[1:20]
top_PC2_genes <- names(top_PC2)
top_genes_df <- data.frame(
Gene = unique(c(top_PC1_genes, top_PC2_genes)),
PC1_Loading = loadings[unique(c(top_PC1_genes, top_PC2_genes)), 1],
PC2_Loading = loadings[unique(c(top_PC1_genes, top_PC2_genes)), 2]
)
# View the top loading genes
print(top_genes_df) Gene PC1_Loading PC2_Loading
FRMD1 FRMD1 0.0103265987 -2.488550e-04
RFLNA RFLNA 0.0103125699 -2.264373e-03
FLNC FLNC 0.0103111839 -1.548665e-03
TMEM184A TMEM184A 0.0103054779 6.337356e-05
CDH15 CDH15 0.0103025945 -7.434250e-04
PCSK4 PCSK4 0.0103019298 -2.212172e-03
FTCD FTCD 0.0102996653 3.134857e-05
P3H3 P3H3 0.0102992146 -1.817432e-03
ENTPD8 ENTPD8 0.0102840532 4.141963e-04
RGMA RGMA 0.0102773231 -1.841016e-03
PLEKHG4 PLEKHG4 0.0102698097 -9.853430e-04
BRF1 BRF1 0.0102697447 -6.386556e-04
PAPLN PAPLN 0.0102656088 2.446767e-04
CDH16 CDH16 0.0102575332 8.048503e-04
ADAMTS7 ADAMTS7 0.0102523961 -1.161742e-03
PCSK9 PCSK9 0.0102522658 5.459586e-04
HAUS7 HAUS7 0.0102510629 -2.783210e-03
TREX2 TREX2 0.0102510629 -2.783210e-03
GRM6 GRM6 0.0102487418 -7.325027e-04
PLXNB3 PLXNB3 0.0102443856 1.431472e-03
CSAD CSAD -0.0008695287 2.034106e-02
STX3 STX3 -0.0009448657 2.022802e-02
CFLAR CFLAR -0.0019990004 2.016113e-02
ANTXR2 ANTXR2 -0.0021284699 2.013467e-02
DLEU2 DLEU2 -0.0031356924 2.011404e-02
RNF130 RNF130 -0.0009918066 1.999769e-02
AREL1 AREL1 -0.0009697241 1.999621e-02
FAM13A-AS1 FAM13A-AS1 -0.0027400154 1.998136e-02
PTEN PTEN -0.0017053654 1.992487e-02
ZNF658 ZNF658 -0.0019231624 1.978870e-02
MIR15A MIR15A -0.0033335702 1.965325e-02
MIR16-1 MIR16-1 -0.0033335702 1.965325e-02
MRPS34 MRPS34 -0.0021521540 -1.960692e-02
RNF149 RNF149 -0.0022026984 1.960690e-02
SNORD89 SNORD89 -0.0022026984 1.960690e-02
LINC00678 LINC00678 -0.0009224228 1.948418e-02
CFLAR-AS1 CFLAR-AS1 -0.0007624817 1.945292e-02
TLN1 TLN1 -0.0012050808 1.943717e-02
LOC105375532 LOC105375532 -0.0008463255 1.940939e-02
LINC02166 LINC02166 -0.0022100190 1.939173e-02
✅ Simple
✅ Captures the most expressive genes
❌ May miss genes strongly related to SLEDAI but low variance
common_samples <- intersect(colnames(expr_data), rownames(pheno_clean))
expr_filtered <- expr_data[, common_samples]
pheno_filtered <- pheno_clean[common_samples, ]
# 3. Select top variable genes (e.g., top 500)
gene_variances <- apply(expr_filtered, 1, var)
top_genes <- names(sort(gene_variances, decreasing = TRUE))[1:500]
# 4. Create the final expression matrix for modeling (samples x genes)
expr_selected <- t(expr_filtered[top_genes, ])
expr_df <- as.data.frame(expr_selected)
expr_df$sledai <- pheno_filtered$sledai_at_baselineRandom Forest
library(caret)Loading required package: lattice
set.seed(42)
# Train-test split
train_idx <- createDataPartition(expr_df$sledai, p = 0.8, list = FALSE)
train_data <- expr_df[train_idx, ]
test_data <- expr_df[-train_idx, ]
# Train a Random Forest regressor
rf_model <- train(
sledai ~ ., data = train_data,
method = "rf",
trControl = trainControl(method = "cv", number = 5)
)# Predict on test set
pred <- predict(rf_model, test_data)
# Evaluate performance
results <- postResample(pred, obs = test_data$sledai)
print(results) # Includes RMSE, R-squared RMSE Rsquared MAE
3.3540065 0.0411851 2.6179820
✅ MAE/RMSE in the range of 2–3 points is a decent starting point, especially for a biological score like SLEDAI that has a limited range.
❌ But R² = 0.04 means the model is not capturing much of the variance — it’s only slightly better than guessing the mean SLEDAI for everyone.
# Select Top 300 Genes Correlated with SLEDAI
cor_vals <- apply(expr_filtered, 1, function(x) cor(x, pheno_filtered$sledai_at_baseline, use = "complete.obs"))
top_corr_genes <- names(sort(abs(cor_vals), decreasing = TRUE))[1:300]
expr_corr <- t(expr_filtered[top_corr_genes, ])
# Combine with SLEDAI score
expr_corr_df <- as.data.frame(expr_corr)
expr_corr_df$sledai <- pheno_filtered$sledai_at_baseline
set.seed(123)
train_idx <- createDataPartition(expr_corr_df$sledai, p = 0.8, list = FALSE)
train_data <- expr_corr_df[train_idx, ]
test_data <- expr_corr_df[-train_idx, ]
# Train Random Forest again
rf_model_corr <- train(
sledai ~ ., data = train_data,
method = "rf",
trControl = trainControl(method = "cv", number = 5)
)
# Predict and evaluate
pred_corr <- predict(rf_model_corr, test_data)
results_corr <- postResample(pred_corr, test_data$sledai)
print(results_corr) RMSE Rsquared MAE
3.5097314 0.1118972 2.5604374
| Metric | Value | Interpretation |
|---|---|---|
| RMSE | 3.51 | Average prediction error is ~3.5 points (slightly worse than before) |
| MAE | 2.56 | Average absolute error is ~2.56 points — a tiny improvement |
| R² | 0.112 | ✅ Now explains 11.2% of the variance in SLEDAI — more than 2x better than the previous model |